{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import pandas as pd\n",
    "import jieba\n",
    "import os\n",
    "from torch.nn import init\n",
    "from torchtext import data\n",
    "from torchtext.vocab import Vectors\n",
    "import time\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# data processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分词\n",
    "def tokenizer(text): \n",
    "    return [word for word in jieba.lcut(text) if word not in stop_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去停用词\n",
    "def get_stop_words():\n",
    "    file_object = open('data/stopwords.txt',encoding='utf-8')\n",
    "    stop_words = []\n",
    "    for line in file_object.readlines():\n",
    "        line = line[:-1]\n",
    "        line = line.strip()\n",
    "        stop_words.append(line)\n",
    "    return stop_words\n",
    "\n",
    "stop_words = get_stop_words()  # 加载停用词表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = data.Field(sequential=True,\n",
    "                  lower=True,\n",
    "                  tokenize=tokenizer,\n",
    "                  stop_words=stop_words)\n",
    "label = data.Field(sequential=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 5.798 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "train, val = data.TabularDataset.splits(\n",
    "    path='data/',\n",
    "    skip_header=True,\n",
    "    train='train.tsv',\n",
    "    validation='validation.tsv',\n",
    "    format='tsv',\n",
    "    fields=[('index', None), ('label', label), ('text', text)],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['油耗', '显示', '13', '升', '多一点', '希望', '慢慢', '下降', '倒车', '雷达', '真', '可恨']\n",
      "dict_keys(['label', 'text'])\n"
     ]
    }
   ],
   "source": [
    "print(train[2].text)\n",
    "print(train[5].__dict__.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#加载Google训练的词向量\n",
    "import gensim\n",
    "model = gensim.models.KeyedVectors.load_word2vec_format('data/myvector.vector', binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "cache = 'data/.vector_cache'\n",
    "if not os.path.exists(cache):\n",
    "    os.mkdir(cache)\n",
    "vectors = Vectors(name='data/myvector.vector', cache=cache)\n",
    "# 指定Vector缺失值的初始化方式，没有命中的token的初始化方式\n",
    "#vectors.unk_init = nn.init.xavier_uniform_\n",
    "\n",
    "text.build_vocab(train, val, vectors=vectors)#加入测试集的vertor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#text.build_vocab(train, val, vectors=Vectors(name='data/myvector.vector'))#加入测试集的vertor\n",
    "label.build_vocab(train, val)\n",
    "\n",
    "embedding_dim = text.vocab.vectors.size()[-1]\n",
    "vectors = text.vocab.vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('空间', 13734),\n",
       " ('外观', 10641),\n",
       " ('满意', 9777),\n",
       " ('车', 8260),\n",
       " ('动力', 7856),\n",
       " ('油耗', 7540),\n",
       " ('高', 6119),\n",
       " ('内饰', 6068),\n",
       " ('感觉', 5383),\n",
       " ('配置', 4596)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([34841, 100])\n"
     ]
    }
   ],
   "source": [
    "text.vocab.freqs.most_common(10)\n",
    "print(text.vocab.vectors.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size=128\n",
    "train_iter, val_iter = data.Iterator.splits(\n",
    "            (train, val),\n",
    "            sort_key=lambda x: len(x.text),\n",
    "            batch_sizes=(batch_size, len(val)), # 训练集设置batch_size,验证集整个集合用于测试\n",
    "    )\n",
    "\n",
    "vocab_size = len(text.vocab)\n",
    "label_num = len(label.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([28, 128])\n",
      "tensor([[  367, 22789,   287,  ...,  6840,    80,     4],\n",
      "        [   42,  7646,   438,  ...,    17,     1,    21],\n",
      "        [    2,  1777,   193,  ...,   119,     1,     2],\n",
      "        ...,\n",
      "        [    1,     1,     1,  ...,     1,     1,     1],\n",
      "        [    1,     1,     1,  ...,     1,     1,     1],\n",
      "        [    1,     1,     1,  ...,     1,     1,     1]])\n"
     ]
    }
   ],
   "source": [
    "batch = next(iter(train_iter))\n",
    "data = batch.text\n",
    "print(batch.text.shape)\n",
    "print(batch.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BiLSTM_Attention(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, num_hiddens, num_layers):\n",
    "        super(BiLSTM_Attention, self).__init__()\n",
    "        # embedding之后的shape: torch.Size([200, 8, 300])\n",
    "        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.word_embeddings = self.word_embeddings.from_pretrained(\n",
    "            vectors, freeze=False)\n",
    "        # bidirectional设为True即得到双向循环神经网络\n",
    "        self.encoder = nn.LSTM(input_size=embedding_dim,\n",
    "                               hidden_size=num_hiddens,\n",
    "                               num_layers=num_layers,\n",
    "                               batch_first=False,\n",
    "                               bidirectional=True)\n",
    "        # 初始时间步和最终时间步的隐藏状态作为全连接层输入\n",
    "        self.w_omega = nn.Parameter(torch.Tensor(\n",
    "            num_hiddens * 2, num_hiddens * 2))\n",
    "        self.u_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, 1))\n",
    "        self.decoder = nn.Linear(2*num_hiddens, 2)\n",
    "\n",
    "        nn.init.uniform_(self.w_omega, -0.1, 0.1)\n",
    "        nn.init.uniform_(self.u_omega, -0.1, 0.1)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        # inputs的形状是(seq_len,batch_size)\n",
    "        embeddings = self.word_embeddings(inputs)\n",
    "        # 提取词特征，输出形状为(seq_len,batch_size,embedding_dim)\n",
    "        # rnn.LSTM只返回最后一层的隐藏层在各时间步的隐藏状态。\n",
    "        outputs, _ = self.encoder(embeddings)  # output, (h, c)\n",
    "        # outputs形状是(seq_len,batch_size, 2 * num_hiddens)\n",
    "        x = outputs.permute(1, 0, 2)\n",
    "        # x形状是(batch_size, seq_len, 2 * num_hiddens)\n",
    "        \n",
    "        # Attention过程\n",
    "        u = torch.tanh(torch.matmul(x, self.w_omega))\n",
    "       # u形状是(batch_size, seq_len, 2 * num_hiddens)\n",
    "        att = torch.matmul(u, self.u_omega)\n",
    "       # att形状是(batch_size, seq_len, 1)\n",
    "        att_score = F.softmax(att, dim=1)\n",
    "       # att_score形状仍为(batch_size, seq_len, 1)\n",
    "        scored_x = x * att_score\n",
    "       # scored_x形状是(batch_size, seq_len, 2 * num_hiddens)\n",
    "        # Attention过程结束\n",
    "        \n",
    "        feat = torch.sum(scored_x, dim=1)\n",
    "       # feat形状是(batch_size, 2 * num_hiddens)\n",
    "        outs = self.decoder(feat)\n",
    "       # out形状是(batch_size, 2)\n",
    "        return outs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BiLSTM_Attention(\n",
      "  (word_embeddings): Embedding(34841, 100)\n",
      "  (encoder): LSTM(100, 64, bidirectional=True)\n",
      "  (decoder): Linear(in_features=128, out_features=2, bias=True)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "embedding_dim, num_hiddens, num_layers = 100, 64, 1\n",
    "net = BiLSTM_Attention(vocab_size, embedding_dim, num_hiddens, num_layers)\n",
    "print(net)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_accuracy(data_iter,net):\n",
    "    acc_sum, n = 0.0, 0\n",
    "    with torch.no_grad():\n",
    "        for batch_idx, batch in enumerate(data_iter):\n",
    "            X, y = batch.text, batch.label\n",
    "         #   X = X.permute(1, 0)\n",
    "            y.data.sub_(1)  #X转置 y为啥要减1\n",
    "            if isinstance(net, torch.nn.Module):\n",
    "                net.eval() # 评估模式, 这会关闭dropout\n",
    "                acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()\n",
    "                net.train() # 改回训练模式\n",
    "            else: # 自定义的模型, 3.13节之后不会用到, 不考虑GPU\n",
    "                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数\n",
    "                    # 将is_training设置成False\n",
    "                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() \n",
    "                else:\n",
    "                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() \n",
    "            n += y.shape[0]\n",
    "    return acc_sum / n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(train_iter, test_iter, net, loss, optimizer, num_epochs):\n",
    "    batch_count = 0\n",
    "    for epoch in range(num_epochs):\n",
    "        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()\n",
    "        for batch_idx, batch in enumerate(train_iter):\n",
    "            X, y = batch.text, batch.label\n",
    "           # X = X.permute(1, 0)\n",
    "            y.data.sub_(1)  #X转置 y为啥要减1\n",
    "            y_hat = net(X)\n",
    "            l = loss(y_hat, y)\n",
    "            optimizer.zero_grad()\n",
    "            l.backward()\n",
    "            optimizer.step()\n",
    "            train_l_sum += l.item()\n",
    "            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()\n",
    "            n += y.shape[0]\n",
    "            batch_count += 1\n",
    "        test_acc = evaluate_accuracy(test_iter, net)\n",
    "        print(\n",
    "            'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\n",
    "            % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,\n",
    "               test_acc, time.time() - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, loss 0.0815, train acc 0.967, test acc 0.904, time 383.3 sec\n",
      "epoch 2, loss 0.0290, train acc 0.978, test acc 0.896, time 382.0 sec\n",
      "epoch 3, loss 0.0176, train acc 0.979, test acc 0.895, time 380.0 sec\n",
      "epoch 4, loss 0.0124, train acc 0.980, test acc 0.895, time 400.7 sec\n",
      "epoch 5, loss 0.0096, train acc 0.981, test acc 0.896, time 437.9 sec\n"
     ]
    }
   ],
   "source": [
    "lr, num_epochs = 0.01, 5\n",
    "optimizer = torch.optim.Adam(net.parameters(), lr=lr)\n",
    "loss = nn.CrossEntropyLoss()\n",
    "train(train_iter, val_iter, net, loss, optimizer, num_epochs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "oldHeight": 263.875222,
   "position": {
    "height": "40px",
    "left": "672.097px",
    "right": "20px",
    "top": "40px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "varInspector_section_display": "none",
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
